import requests
import pandas as pd
import io
import tweepy
import json
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as dates
from IPython.core.display import Image, display
import urllib.request
Gather data from local csv file.
# Gather: load the local WeRateDogs archive into a dataframe and peek at it.
archive_path = '../data/twitter-archive-enhanced.csv'
twitter_archive_df = pd.read_csv(archive_path)
twitter_archive_df.head(20)
twitter_archive_df.shape
Gather data from a URL with the requests library.
# Gather: download the image-predictions file programmatically with requests.
r = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')
r.status_code
r.headers['content-type']
raw_data = r.content
# decode the response body and parse the tab-separated content in memory
decoded = raw_data.decode('utf-8')
prediction_df = pd.read_csv(io.StringIO(decoded), sep='\t')
prediction_df.tail(2)
# keep a local copy of the predictions
prediction_df.to_csv('../data/image-predictions.tsv', sep='\t')
Gather data via a twitter API. Key and tokens are removed.
# Gather: query the Twitter API for each archived tweet id.
# Credentials are placeholders; the real keys and tokens were removed.
consumer_key = 'XXXXXX'
consumer_secret = 'XXXXXX'
access_token = 'XXXXXX'
access_token_secret = 'XXXXXX'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit makes tweepy sleep through Twitter's rate-limit windows
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

tweets = []
for tweet_id in twitter_archive_df['tweet_id'].tolist():
    try:
        tweet = api.get_status(tweet_id)  # , tweet_mode='extended')
        tweets.append(tweet)
    except tweepy.TweepError as e:
        # tweet may be deleted/protected; log the API error and continue
        print(e.response.text)
len(tweets)
# Collect the raw JSON payload of every fetched status object,
# then dump them all to disk so the API need not be queried again.
my_list_of_dicts = [status._json for status in tweets]
with open('../data/tweet_json.txt', 'w') as file:
    file.write(json.dumps(my_list_of_dicts, indent=4))
# Rebuild a tidy dataframe from the stored JSON: keep only the columns
# needed for the later analysis, then persist and reload as CSV.
with open('../data/tweet_json.txt', encoding='utf-8') as json_file:
    all_data = json.load(json_file)

tweet_list = []
for record in all_data:
    tweet_list.append({
        'tweet_id': str(record['id']),
        'text': str(record['text']),
        'favorite_count': int(record['favorite_count']),
        'retweet_count': int(record['retweet_count']),
        'created_at': record['created_at'],
    })

tweet_json = pd.DataFrame(tweet_list,
                          columns=['tweet_id', 'text', 'favorite_count',
                                   'retweet_count', 'created_at'])
tweet_json.shape
tweet_json.head()
# write dataframe to a csv file
tweet_json.to_csv('../data/tweet_dogs.csv', index=False)
# read tweet_dogs.csv (to avoid accessing the API every time)
tweets_df = pd.read_csv('../data/tweet_dogs.csv')
tweets_df.head(20)
The cleaning step comes first here, to expose the dirty data in the individual dataframes before joining all three. Usually tidying would be the first step.
# Clean on deep copies so the raw gathered dataframes stay untouched.
twitter_archive_df_cleaned = twitter_archive_df.copy(deep=True)
prediction_df_cleaned = prediction_df.copy(deep=True)
tweets_df_cleaned = tweets_df.copy(deep=True)
Get info about data types of every dataframe.
# Print dtype/null summaries for all three working dataframes.
for frame in (twitter_archive_df_cleaned, prediction_df_cleaned, tweets_df_cleaned):
    frame.info()
Check if every tweet id is unique and there are no duplicates.
# Every tweet id should occur exactly once in the archive.
twitter_archive_df_cleaned.tweet_id.nunique()
twitter_archive_df_cleaned.tweet_id.duplicated().any()
Convert to datetime:
# Parse the raw timestamp strings into proper datetime64 columns.
for frame, col in ((twitter_archive_df_cleaned, 'timestamp'),
                   (tweets_df_cleaned, 'created_at')):
    frame[col] = pd.to_datetime(frame[col])
print(type(twitter_archive_df_cleaned['timestamp'][0]), type(tweets_df_cleaned['created_at'][0]))
Make data type of doggo, floofer, pupper and puppo columns bool values:
# Convert the four dog-stage columns from {'<stage>', 'None'} strings to booleans.
# Assigning the replace() result back — instead of calling
# replace(..., inplace=True) on the column view as the original did — avoids
# pandas' chained-assignment pitfall, which silently does nothing under
# copy-on-write (default in pandas >= 3, opt-in since 2.x).
#twitter_archive_df_cleaned['puppo'].unique() # to explore
for stage in ('doggo', 'floofer', 'pupper', 'puppo'):
    twitter_archive_df_cleaned[stage] = (
        twitter_archive_df_cleaned[stage].replace({stage: True, 'None': False})
    )
    print(twitter_archive_df_cleaned[stage].unique())  # to test
Tweets without an image URL are not original ratings. Remove them:
# Tweets with no expanded_urls have no image, hence no original rating.
#print(twitter_archive_df[twitter_archive_df['expanded_urls'].isnull()]) # for exploration
rows_before = twitter_archive_df_cleaned.shape[0]
twitter_archive_df_cleaned = twitter_archive_df_cleaned.dropna(subset=['expanded_urls'])
dropped = rows_before - twitter_archive_df_cleaned.shape[0]
print("Dropped ", dropped, " due to non-original rating tweets.")
Rating tweets without dog name are not wanted for this analysis later. Remove them:
# Drop rows whose dog name is the literal string "None" — unusable later.
#twitter_archive_df[twitter_archive_df['name'] == "None"].shape[0] # to explore
rows_before = twitter_archive_df_cleaned.shape[0]
has_name = twitter_archive_df_cleaned['name'] != "None"
twitter_archive_df_cleaned = twitter_archive_df_cleaned[has_name]
print("Dropped ", rows_before - twitter_archive_df_cleaned.shape[0], " due to unknown dog name.")
Remove tweets with a retweeted_status_id, because they are retweets, not original rating tweets.
# A non-null retweeted_status_id marks a retweet, not an original rating.
#twitter_archive_df[~twitter_archive_df['retweeted_status_id'].isnull()] # for exploration
rows_before = twitter_archive_df_cleaned.shape[0]
is_original = twitter_archive_df_cleaned['retweeted_status_id'].isnull()
twitter_archive_df_cleaned = twitter_archive_df_cleaned[is_original]
print("Dropped ", rows_before - twitter_archive_df_cleaned.shape[0], " due to re-tweets.")
If the denominator is not 10, then it is not a valid rating. These rows will be removed as well:
# A rating is only valid when its denominator is exactly 10.
#twitter_archive_df_cleaned[twitter_archive_df_cleaned.rating_denominator!= 10] # for exploration
rows_before = twitter_archive_df_cleaned.shape[0]
valid_denominator = twitter_archive_df_cleaned['rating_denominator'] == 10
twitter_archive_df_cleaned = twitter_archive_df_cleaned[valid_denominator]
print("Dropped ", rows_before - twitter_archive_df_cleaned.shape[0], " due to invalid rating denominator.")
If the neural network can not predict that it is a dog from the image, then it will be removed as well:
# Keep only rows where the network's top prediction (p1) says it is a dog.
# BUG FIX: the original built the boolean mask from prediction_df instead of
# prediction_df_cleaned — that only works while both frames share the same
# index, and breaks as soon as the cleaned copy is filtered first.
#prediction_df_cleaned[prediction_df_cleaned.p1_dog == False] # for exploration
rows_before = prediction_df_cleaned.shape[0]
prediction_df_cleaned = prediction_df_cleaned[prediction_df_cleaned['p1_dog']]
print("Dropped ", rows_before-prediction_df_cleaned.shape[0], " due to no-dog prediction.")
Remove all tweets after August 1st 2017:
# Keep only tweets created strictly before August 1st 2017 in both frames.
#tweets_df_cleaned[tweets_df_cleaned.created_at > '2017-08-01'] # to explore
cutoff = '2017-08-01'
rows_before = tweets_df_cleaned.shape[0]
tweets_df_cleaned = tweets_df_cleaned[tweets_df_cleaned['created_at'] < cutoff]
print("Dropped ", rows_before - tweets_df_cleaned.shape[0], " due to date limit")
rows_before = twitter_archive_df_cleaned.shape[0]
twitter_archive_df_cleaned = twitter_archive_df_cleaned[twitter_archive_df_cleaned['timestamp'] < cutoff]
print("Dropped ", rows_before - twitter_archive_df_cleaned.shape[0], " due to date limit")
# Report the shape of every cleaned dataframe before joining.
frames = {'twitter_archive_df_cleaned': twitter_archive_df_cleaned,
          'prediction_df_cleaned': prediction_df_cleaned,
          'tweets_df_cleaned': tweets_df_cleaned}
for label, frame in frames.items():
    print("rows of " + label + ": ", frame.shape[0])
for label, frame in frames.items():
    print("columns of " + label + ": ", frame.shape[1])
Merge all dataframes:
# Inner-join the archive with the image predictions on tweet_id.
joined_df = twitter_archive_df_cleaned.join(
    prediction_df_cleaned.set_index('tweet_id'), on='tweet_id', how='inner')
joined_df.shape
# Then pull in the API-derived counts; overlapping columns get an
# '_otweet' suffix on the right-hand side.
joined_df = joined_df.join(
    tweets_df_cleaned.set_index('tweet_id'), on='tweet_id', how='inner',
    lsuffix='', rsuffix='_otweet')
joined_df.shape
joined_df.head(2)
Check if the duplicated column for timestamps have the same content:
# Sanity check after joining: archive timestamp vs API created_at.
same_time = joined_df['timestamp'] == joined_df['created_at']
joined_df[same_time]
#joined_df[joined_df['jpg_url'] == joined_df['expanded_urls']] # for exploration
Remove all columns that are duplicates or that are not needed for later analysis:
# Drop duplicated/irrelevant columns, then persist the master dataset
# both as a CSV file and as a SQLite table.
needless = ['created_at', 'in_reply_to_status_id', 'in_reply_to_user_id',
            'source', 'retweeted_status_id', 'retweeted_status_user_id',
            'retweeted_status_timestamp', 'text_otweet', 'expanded_urls']
joined_df = joined_df.drop(columns=needless)
joined_df
joined_df.columns
joined_df.to_csv('../data/twitter_archive_master.csv', index=False)
conn = sqlite3.connect("../data/twitter_archive_master.db")
joined_df.to_sql("twitter_archive_master", conn, if_exists="replace")
# round-trip check: read the table back out of SQLite
df = pd.read_sql_query("select * from twitter_archive_master;", conn)
df.head()
The most common rating values and the highest rating value:
df['rating_numerator'].value_counts()
SPOTTED: There is an error in the rating extraction from the text, because in the tweet the dog has a rating of 9.75, not 75.
Likewise, the rating 27 was meant as an 11.27 rating. These ratings weren't extracted correctly, so they must be cleaned as well.
#df[df['rating_numerator'] == 27] # test
# Discard ratings whose numerator was mis-extracted from the tweet text.
rows = df.shape[0]
plausible = df['rating_numerator'] < 27
df = df[plausible]
print("Dropped ", rows - df.shape[0], ' after removing wrong computed ratings.')
The maximum value that was given as rating:
df['rating_numerator'].max()
The amount of tweets (dogs) that received the maximum rating:
# All dogs that received the maximum rating.
top_score = df['rating_numerator'].max()
df_best_rated = df[df['rating_numerator'] == top_score].reset_index(drop=True)
df_best_rated.shape
Those are the dogs that received the maximum rating 14/10:
# Display every top-rated dog and save its image locally.
for idx, row in df_best_rated.iterrows():
    dog_url = row['jpg_url']
    print('Dog name: ', row['name'])
    print('Dog breed: ', row['p1'])
    print('prediction rate: ', row['p1_conf'])
    print('rate: ', row['rating_numerator'], '/ 10')
    display(Image(dog_url, width=400, unconfined=True))
    urllib.request.urlretrieve(dog_url, "images/best_rated_dog_"+str(idx)+".jpg")
The dog breed that is predicted the easiest and its amount in the dataset:
# Mean prediction confidence and sample count per predicted breed.
breed_conf = df.groupby('p1')['p1_conf']
df_stat = pd.DataFrame(breed_conf.mean())
df_stat['count'] = breed_conf.count()
# the 10 breeds the network is most confident about on average
df_stat.sort_values('p1_conf',ascending=False)[:10]
df_best_predicted = df[df['p1'] == 'komondor'].reset_index(drop=True)
Those are the dogs that belong to the best predicted breed:
# Display every dog of the best-predicted breed and archive its image.
for idx, row in df_best_predicted.iterrows():
    dog_url = row['jpg_url']
    print('Dog name: ', row['name'])
    print('Dog breed: ', row['p1'])
    print('prediction: ', row['p1_conf'])
    print('rate: ', row['rating_numerator'], '/ 10')
    display(Image(dog_url, width=400, unconfined=True))
    urllib.request.urlretrieve(dog_url, "images/best_predicted_breed_"+str(idx)+".jpg")
And here is the dog whose breed was best predicted: (also from the same breed as the best predicted breed)
# The single dog with the highest prediction confidence overall.
df_best_predicted = df[df['p1_conf'] == df['p1_conf'].max()]
df_best_predicted = df_best_predicted.reset_index(drop=True)
df_best_predicted.shape
for dog in range(df_best_predicted.shape[0]):
    dog_url = df_best_predicted['jpg_url'][dog]
    print('Dog name: ', df_best_predicted['name'][dog])
    print('Dog breed: ', df_best_predicted['p1'][dog])
    print('prediction: ', df_best_predicted['p1_conf'][dog])
    print('rate: ', df_best_predicted['rating_numerator'][dog], '/ 10')
    print('retweet: ', df_best_predicted['retweet_count'][dog])
    print('favorite: ', df_best_predicted['favorite_count'][dog])
    display(Image(dog_url, width=400, unconfined=True))
    # BUG FIX: the original reused the "best_predicted_breed_" filename prefix
    # from the previous loop, silently overwriting those images on disk.
    urllib.request.urlretrieve(dog_url, "images/best_predicted_dog_"+str(dog)+".jpg")
The 10 most rated dog breeds:
# The 10 breeds with the most ratings in the dataset.
df_stat.sort_values('count',ascending=False)[:10]
df_most_rated = df[df['p1'] == 'golden_retriever'].reset_index(drop=True)
df_most_rated.shape
Let's look at 10 dogs of the most rated breed in the dataset — the golden retriever:
# Show at most 10 dogs of the most-rated breed (golden retriever).
# min() guards against an IndexError: the original hard-coded range(10),
# which crashes if fewer than 10 rows survive the cleaning steps.
for dog in range(min(10, df_most_rated.shape[0])):
    dog_url = df_most_rated['jpg_url'][dog]
    print('Dog name: ', df_most_rated['name'][dog])
    print('Dog breed: ', df_most_rated['p1'][dog])
    print('prediction rate: ', df_most_rated['p1_conf'][dog])
    print('rate: ', df_most_rated['rating_numerator'][dog], '/ 10')
    display(Image(dog_url, width=400, unconfined=True))
    urllib.request.urlretrieve(dog_url, "images/most_rated_breed"+str(dog)+".jpg")
The most favored dog from twitter favorite counts:
# The tweet(s) with the highest favorite count.
df_most_favorited = df[df['favorite_count'] == df['favorite_count'].max()]
df_most_favorited = df_most_favorited.reset_index(drop=True)
df_most_favorited.shape
for dog in range(df_most_favorited.shape[0]):
    dog_url = df_most_favorited['jpg_url'][dog]
    print('Dog name: ', df_most_favorited['name'][dog])
    print('Dog breed: ', df_most_favorited['p1'][dog])
    print('favorited: ', df_most_favorited['favorite_count'][dog])
    # BUG FIX: the original read p1_conf from df_most_rated — a different
    # dataframe — printing another dog's confidence (and risking a KeyError
    # when that index does not exist there).
    print('prediction rate: ', df_most_favorited['p1_conf'][dog])
    print('rate: ', df_most_favorited['rating_numerator'][dog], '/ 10')
    print('retweet: ', df_most_favorited['retweet_count'][dog])
    print('favorite: ', df_most_favorited['favorite_count'][dog])
    display(Image(dog_url, width=400, unconfined=True))
    urllib.request.urlretrieve(dog_url, "images/df_most_favorited_"+str(dog)+".jpg")
How close are the other dogs with their favorite counts? Therefore we look at the histogram of the favorite counts:
df['favorite_count'].hist(bins=100)
Who is the most retweeted dog?
# The tweet(s) with the highest retweet count.
max_retweets = df['retweet_count'].max()
df_most_retweeted = df[df['retweet_count'] == max_retweets].reset_index(drop=True)
df_most_retweeted.shape
for idx, row in df_most_retweeted.iterrows():
    dog_url = row['jpg_url']
    print('Dog name: ', row['name'])
    print('Dog breed: ', row['p1'])
    print('favorited: ', row['favorite_count'])
    print('rate: ', row['rating_numerator'], '/ 10')
    display(Image(dog_url, width=400, unconfined=True))
    urllib.request.urlretrieve(dog_url, "images/df_most_retweeted_"+str(idx)+".jpg")
# Global seaborn styling for all the plots that follow.
sns.set_style("whitegrid")
sns.set(rc={'figure.figsize':(20,12)})
sns.set(font_scale=2.0)
sns.set_palette("pastel")
# Bar chart of how often each dog-stage word occurs.
df_words = df[['puppo', 'doggo', 'floofer', 'pupper']]
# BUG FIX: capture the Axes returned by plot(); the original discarded it,
# so the following `ax.get_figure()` raised a NameError.
ax = df_words.apply(pd.value_counts).plot(kind='bar',
                                          title='all types')
fig = ax.get_figure()
fig.savefig("plots/dog_vocabularies.png")
# Ratings over time, colored and sized by retweet count.
df['timestamp'] = pd.to_datetime(df['timestamp'])
ax = sns.scatterplot(data=df, x="timestamp", y="rating_numerator",
                     hue="retweet_count", size="retweet_count", alpha=.8)
ax.set_xlim(df['timestamp'].min(), df['timestamp'].max())
fig = ax.get_figure()
fig.savefig("plots/retweet_rating_over_time_b.png")
The ratings have changed over time. The ratings have increased.
# Ratings over time, colored and sized by favorite count.
ax = sns.scatterplot(data=df, x="timestamp", y="rating_numerator",
                     hue="favorite_count", size="favorite_count", alpha=.8)
ax.set_xlim(df['timestamp'].min(), df['timestamp'].max())
fig = ax.get_figure()
fig.savefig("plots/favorit_rating_over_time_b.png")
If we look at the favorite count and the retweet count in the diagrams above, then we can see, that they might be similar.
# Retweets against favorites, colored by rating.
ax = sns.scatterplot(data=df, x="favorite_count", y="retweet_count",
                     hue="rating_numerator", alpha=.8)
fig = ax.get_figure()
fig.savefig("plots/retweet_favorit_scatter_b.png")
The retweet and favorite count correlate.
# Retweets against rating, colored by favorite count.
ax = sns.scatterplot(data=df, x="rating_numerator", y="retweet_count",
                     hue="favorite_count", alpha=.8)
fig = ax.get_figure()
fig.savefig("plots/retweet_rating_scatter_b.png")
The low rating values are not retweeted well. The higher rating values (x-axis) are retweeted more often (y-axis). And also favored more often (color).
# Prediction confidence against favorite count.
ax = sns.scatterplot(data=df, x="favorite_count", y="p1_conf", alpha=.8)
fig = ax.get_figure()
fig.savefig("plots/favorite_p1conf_scatter.png")
There is no correlation between the prediction confidence of the neural network and the favorite count.